#include "cuda.h"
#include "cuda_runtime.h"

#include <string>
#include <fstream>
#include <streambuf>
#include <stdio.h>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <new>
#include <random>
#include <memory>

#include <nvrtc.h>

// Returns the name string that cuGetErrorName reports for `error`.
// If no name pointer is produced (it is still NULL after the call),
// the placeholder "<unknown>" is returned instead, so the result is
// always safe to pass to a "%s" format.
static const char *_cudaGetErrorEnum(CUresult error) {
    static char fallback[] = "<unknown>";

    const char *name = NULL;
    cuGetErrorName(error, &name);

    if (name == NULL) {
        return fallback;
    }
    return name;
}

// CHECK(call): evaluates a CUDA driver-API expression exactly once and, if the
// result is not CUDA_SUCCESS, prints the source location, the numeric code and
// the name from _cudaGetErrorEnum to stderr, then terminates with exit(1).
//
// The argument is parenthesized and the internal temporary uses a name that is
// unlikely to collide with caller variables, so an expression such as
// CHECK(foo(error)) refers to the caller's `error`, not to the macro's local.
#define CHECK(call)                                                                \
  do {                                                                             \
    const CUresult check_result_ = (call);                                         \
    if (check_result_ != CUDA_SUCCESS) {                                           \
      fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                       \
      fprintf(stderr, "code: %d, reason: %s\n", static_cast<int>(check_result_),   \
              _cudaGetErrorEnum(check_result_));                                   \
      exit(1);                                                                     \
    }                                                                              \
  } while (0)

void CompileFileToBC(const std::string &file_name, char **ptxResult, size_t *ptxResultSize) {

    std::ifstream file(file_name.c_str());
    if (!file.is_open()) {
        printf("\nerror: unable to open %s for reading!\n", file_name.c_str());
        exit(EXIT_FAILURE);
    }
    std::string file_stream((std::istreambuf_iterator<char>(file)),
                    std::istreambuf_iterator<char>());

    nvrtcProgram prog{};
    if (NVRTC_SUCCESS != nvrtcCreateProgram(&prog, file_stream.c_str(), file_name.c_str()
            , 0, NULL, NULL)) {
        printf("\nerror:nvrtcCreateProgram failed\n");
        exit(EXIT_FAILURE);
    }

    int numCompileOptions = 0;
    char *compileParams[1] = {nullptr};
    if (NVRTC_SUCCESS != nvrtcCompileProgram(prog, numCompileOptions, compileParams)) {
        printf( "\nerror:nvrtcCompileProgram failed\n");
        exit(EXIT_FAILURE);
    }

    size_t ptx_size{};

    nvrtcGetPTXSize(prog, &ptx_size);

    char *ptx = reinterpret_cast<char *>(malloc(sizeof(char) * ptx_size));
    nvrtcGetPTX(prog, ptx);

    nvrtcDestroyProgram(&prog);
    *ptxResult = ptx;
    *ptxResultSize = ptx_size;
}

/**
 * Initializes the CUDA driver API, creates a context on device ordinal 0 and
 * loads the given image into it as a module.
 *
 * @param ptx  Module image to hand to cuModuleLoadDataEx (no JIT options are
 *             passed). Must not be NULL.
 * @return     Handle of the loaded module.
 *
 * Any driver error is reported through CHECK, which terminates the process.
 * The context created here is neither destroyed nor returned to the caller.
 */
CUmodule loadBC(char *ptx)
{
    if (ptx == NULL) {
        fprintf(stderr, "Error: loadBC called with a NULL module image\n");
        exit(EXIT_FAILURE);
    }

    // cuInit can fail (e.g. no driver present); do not continue past that.
    CHECK(cuInit(0));

    CUdevice device;
    CHECK(cuDeviceGet(&device, 0));

    CUcontext context;
    CHECK(cuCtxCreate(&context, 0, device));

    CUmodule module;
    CHECK(cuModuleLoadDataEx(&module, ptx, 0, 0, 0));

    return module;
}

int main(int argc, char **argv) {

    char *ptx;
    size_t ptxSize;
    std::string kernel_file = "./vector_add_kernel.cu";
    CompileFileToBC(kernel_file, &ptx, &ptxSize);

    CUfunction kernel_addr;
    auto module = loadBC(ptx);
    CHECK(cuModuleGetFunction(&kernel_addr, module, "vectorAdd"));

    int numElements = 50000;
    size_t size = numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);

    std::unique_ptr<float[]> a_vec(new float[size]);
    std::unique_ptr<float[]> b_vec(new float[size]);
    std::unique_ptr<float[]> c_vec(new float[size]);

    if ( a_vec == nullptr || b_vec == nullptr || c_vec == nullptr) {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < numElements; ++i) {
        a_vec[i] = rand() / static_cast<float>(RAND_MAX);
        b_vec[i] = rand() / static_cast<float>(RAND_MAX);
    }

    CUdeviceptr d_A;
    CHECK(cuMemAlloc(&d_A, size));

    CUdeviceptr d_B;
    CHECK(cuMemAlloc(&d_B, size));

    CUdeviceptr d_C;
    CHECK(cuMemAlloc(&d_C, size));

    printf("Copy input data from the host memory to the dli_cuda device\n");
    CHECK(cuMemcpyHtoD(d_A, a_vec.get(), size));
    CHECK(cuMemcpyHtoD(d_B, b_vec.get(), size));

    void *arr[] = {reinterpret_cast<void *>(&d_A), reinterpret_cast<void *>(&d_B),
                   reinterpret_cast<void *>(&d_C),
                   reinterpret_cast<void *>(&numElements)};

    int threads_per_block = 256;
    int blocks_per_grid = (numElements + threads_per_block - 1) / threads_per_block;
    printf("dli kernel launch with %d blocks of %d threads\n", blocks_per_grid,
           threads_per_block);
    dim3 cudaBlockSize(threads_per_block, 1, 1);
    dim3 cudaGridSize(blocks_per_grid, 1, 1);
    for(int i=0;i<10000;i++)
    {
    CHECK(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y,
                                   cudaGridSize.z, /* grid dim */
                                   cudaBlockSize.x, cudaBlockSize.y,
                                   cudaBlockSize.z, /* block dim */
                                   0, 0,            /* shared mem, stream */
                                   &arr[0],         /* arguments */
                                   0));
    CHECK(cuCtxSynchronize());

    //printf("Copy output data from the dli device to the host memory\n");
    CHECK(cuMemcpyDtoH(c_vec.get(), d_C, size));

    for (int i = 0; i < numElements; ++i) {
        if (fabs(a_vec[i] + b_vec[i] - c_vec[i]) > 1e-5) {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }
    }
    printf("Test PASSED\n");

    CHECK(cuMemFree(d_A));
    CHECK(cuMemFree(d_B));
    CHECK(cuMemFree(d_C));

    printf("Done\n");

    return 0;
}
